In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import panel as pn
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
pd.options.plotting.backend = "hvplot"
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# init_notebook_mode(connected=True)
# cf.go_offline()
from panel.template import DefaultTheme

pn.extension()

path = "/Users/glebsokolov/Downloads/Online Retail.xlsx"
In [9]:
df = pd.read_excel(path)
df.head(5)
df.info()
Out[9]:
See Full Dataframe in Mito
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 08:26:00 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB

To fasten loading of this dataset in future, convert it to feather format.

In [22]:
df[['StockCode', 'InvoiceNo', 'Description']] = df[['StockCode', 'InvoiceNo', 'Description']].astype(str)
df.to_feather('onlineRetail.feather')
In [2]:
df = pd.read_feather('onlineRetail.feather')
df.head(10).hvplot.table()
Out[2]:
In [4]:
df.isnull().sum()
df = df.dropna(subset=["CustomerID"])
df.isnull().sum().sum()
Out[4]:
InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
Out[4]:
0
In [5]:
df.duplicated().sum()
df = df.drop_duplicates()
df.duplicated().sum()
Out[5]:
5225
Out[5]:
0
In [3]:
df.describe().hvplot.table()
Out[3]:

Let's get rid of ridiculous values.

In [4]:
df = df[(df["Quantity"] > 0) & (df["UnitPrice"] > 0)]
df.describe().hvplot.table()
df.shape
Out[4]:
Out[4]:
(530104, 8)

For cohort analysis, there are a few labels that we have to create:

  • Invoice period: A string representation of the year and month of a single transaction/invoice.
  • Cohort group: A string representation of the the year and month of a customer’s first purchase. This label is common across all invoices for a particular customer.
  • Cohort period / Cohort Index: A integer representation a customer’s stage in its “lifetime”. The number represents the number of months passed since the first purchase.
In [5]:
def get_month(x) : return dt.datetime(x.year,x.month,1)
df['InvoiceMonth'] = df['InvoiceDate'].apply(get_month)
grouping = df.groupby('CustomerID')['InvoiceMonth']
df['CohortMonth'] = grouping.transform('min')
df.tail().hvplot.table(width=1000)
Out[5]:
In [6]:
def get_month_int (dframe,column):
    year = dframe[column].dt.year
    month = dframe[column].dt.month
    day = dframe[column].dt.day
    return year, month , day 

invoice_year,invoice_month,_ = get_month_int(df,'InvoiceMonth')
cohort_year,cohort_month,_ = get_month_int(df,'CohortMonth')

year_diff = invoice_year - cohort_year 
month_diff = invoice_month - cohort_month 

df['CohortIndex'] = year_diff * 12 + month_diff + 1 
In [7]:
#Count monthly active customers from each cohort
grouping = df.groupby(['CohortMonth', 'CohortIndex'])
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique)
# Return number of unique elements in the object.
cohort_data = cohort_data.reset_index()
cohort_counts = cohort_data.pivot(index='CohortMonth',columns='CohortIndex',values='CustomerID')
cohort_counts.hvplot.table()
Out[7]:
In [8]:
# Retention table
cohort_size = cohort_counts.iloc[:,0]
retention = cohort_counts.divide(cohort_size,axis=0) #axis=0 to ensure the divide along the row axis 
(retention.round(3) * 100).hvplot.table() #to show the number as percentage 
Out[8]:
In [11]:
retention.plot(kind='heatmap', title='Retention Rates', width=600)
Out[11]:
In [33]:
#Build the heatmap
plt.figure(figsize=(15, 8))
plt.title('Retention rates')
sns.heatmap(data=retention,annot = True,fmt = '.0%',vmin = 0.0,vmax = 0.5,cmap="BuPu_r")
plt.show()
Out[33]:
<Figure size 1080x576 with 0 Axes>
Out[33]:
Text(0.5, 1.0, 'Retention rates')
Out[33]:
<AxesSubplot:title={'center':'Retention rates'}, xlabel='CohortIndex', ylabel='CohortMonth'>
In [12]:
#Average quantity for each cohort
grouping = df.groupby(['CohortMonth', 'CohortIndex'])
cohort_data = grouping['Quantity'].mean()
cohort_data = cohort_data.reset_index()
average_quantity = cohort_data.pivot(index='CohortMonth',columns='CohortIndex',values='Quantity')
average_quantity.round(1).hvplot.table()
average_quantity.index = average_quantity.index.date

#Build the heatmap
plt.figure(figsize=(15, 8))
plt.title('Average quantity for each cohort')
sns.heatmap(data=average_quantity,annot = True,vmin = 0.0,vmax =20,cmap="BuGn_r")
plt.show()
Out[12]:
Out[12]:
<Figure size 1080x576 with 0 Axes>
Out[12]:
Text(0.5, 1.0, 'Average quantity for each cohort')
Out[12]:
<AxesSubplot:title={'center':'Average quantity for each cohort'}, xlabel='CohortIndex'>
In [13]:
#New Total Sum Column  
df['TotalSum'] = df['UnitPrice']* df['Quantity']

#Data preparation steps
print('Min Invoice Date:',df.InvoiceDate.dt.date.min(),'max Invoice Date:',
       df.InvoiceDate.dt.date.max())

df.head(3).hvplot.table()
Min Invoice Date: 2010-12-01 max Invoice Date: 2011-12-09
Out[13]:
In [14]:
snapshot_date = df['InvoiceDate'].max() + dt.timedelta(days=1)
snapshot_date
#The last day of purchase in total is 09 DEC, 2011. To calculate the day periods, 
#let's set one day after the last one,or 
#10 DEC as a snapshot_date. We will cound the diff days with snapshot_date.
Out[14]:
Timestamp('2011-12-10 12:50:00')
In [15]:
# Calculate RFM metrics
rfm = df.groupby(['CustomerID']).agg({'InvoiceDate': lambda x : (snapshot_date - x.max()).days,
                                      'InvoiceNo':'count','TotalSum': 'sum'})

#Rename columns
rfm.rename(columns={'InvoiceDate':'Recency','InvoiceNo':'Frequency','TotalSum':'MonetaryValue'}
           ,inplace= True)

#Final RFM values
rfm.head().hvplot.table()
Out[15]:

Note That :

  • We will rate "Recency" customer who have been active more recently better than the less recent customer,because each company wants its customers to be recent

  • We will rate "Frequency" and "Monetary Value" higher label because we want Customer to spend more money and visit more often(that is different order than recency).

In [16]:
#Building RFM segments
r_labels =range(4,0,-1)
f_labels=range(1,5)
m_labels=range(1,5)
r_quartiles = pd.qcut(rfm['Recency'], q=4, labels = r_labels)
f_quartiles = pd.qcut(rfm['Frequency'],q=4, labels = f_labels)
m_quartiles = pd.qcut(rfm['MonetaryValue'],q=4,labels = m_labels)
rfm = rfm.assign(R=r_quartiles,F=f_quartiles,M=m_quartiles)

# Build RFM Segment and RFM Score
def add_rfm(x) : return str(int(x['R'])) + str(int(x['F'])) + str(int(x['M']))
rfm['RFM_Segment'] = rfm.apply(add_rfm,axis=1 )
rfm['RFM_Score'] = rfm[['R','F','M']].sum(axis=1)

rfm.head().hvplot.table()
Out[16]:

The Result is a Table which has a row for each customer with their RFM

In [17]:
rfm.groupby(['RFM_Segment']).size().sort_values(ascending=False)[:5]
#Select bottom RFM segment "111" and view top 5 rows
rfm[rfm['RFM_Segment']=='111'].head().hvplot.table()
Out[17]:
RFM_Segment
444    447
111    385
344    217
122    206
211    179
dtype: int64
Out[17]:
In [18]:
rfm.groupby("RFM_Score").agg(
    {"Recency": "mean", "Frequency": "mean", "MonetaryValue": ["mean", "count"]}
).round(1)


def segments(df):
    if df["RFM_Score"] > 9:
        return "Gold"
    elif (df["RFM_Score"] > 5) and (df["RFM_Score"] <= 9):
        return "Sliver"
    else:
        return "Bronze"


rfm["General_Segment"] = rfm.apply(segments, axis=1)

rfm.groupby("General_Segment").agg(
    {"Recency": "mean", "Frequency": "mean", "MonetaryValue": ["mean", "count"]}
).round(1)
Out[18]:
See Full Dataframe in Mito
Recency Frequency MonetaryValue
mean mean mean count
RFM_Score
3 260.8 8.2 159.3 385
4 175.4 13.6 239.4 382
5 154.3 21.4 368.5 516
6 96.5 27.9 825.7 460
7 79.1 38.5 755.5 459
8 64.6 56.8 994.1 454
9 46.0 80.0 1801.4 417
10 32.1 112.3 2049.4 426
11 21.1 187.7 4083.2 392
12 7.3 374.4 9319.2 447
Out[18]:
See Full Dataframe in Mito
Recency Frequency MonetaryValue
mean mean mean count
General_Segment
Bronze 192.6 15.2 267.3 1283
Gold 19.9 228.3 5248.5 1265
Sliver 72.2 50.1 1077.7 1790

Data Pre-Processing for Kmeans Clustering¶

We must check these Key k-means assumptions before we implement our Kmeans Clustering Algo

  • Symmetric distribution of variables (not skewed)
  • Variables with same average values
  • Variables with same variance
In [19]:
rfm_rfm = rfm[['Recency','Frequency','MonetaryValue']]
print(rfm_rfm.describe())
           Recency    Frequency  MonetaryValue
count  4338.000000  4338.000000    4338.000000
mean     92.536422    91.720609    2054.266460
std     100.014169   228.785094    8989.230441
min       1.000000     1.000000       3.750000
25%      18.000000    17.000000     307.415000
50%      51.000000    41.000000     674.485000
75%     142.000000   100.000000    1661.740000
max     374.000000  7847.000000  280206.020000

From this table, we find this Problem: Mean and Variance are not Equal

Soluation: Scaling variables by using a scaler from scikit-learn library

In [20]:
theme = 'ggplot2'
rfm['Recency'].plot(kind='hist') + rfm['Frequency'].plot(kind='hist') + rfm['MonetaryValue'].plot(kind='hist')
Out[20]:

Also, there is another Problem: UnSymmetric distribution of variables (data skewed)

Soluation:Logarithmic transformation (positive values only) will manage skewness

In [21]:
#Unskew the data with log transformation
rfm_log = rfm[['Recency', 'Frequency', 'MonetaryValue']].apply(np.log, axis = 1).round(3)
rfm_log['Recency'].plot(kind='hist') + rfm_log['Frequency'].plot(kind='hist') + rfm_log['MonetaryValue'].plot(kind='hist')
Out[21]:

Implementation of K-Means Clustering¶

Key steps¶

  • Data pre-processing
  • Choosing a number of clusters
  • Running k-means clustering on pre-processed data
  • Analyzing average RFM values of each cluster
In [22]:
#Normalize the variables with StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(rfm_log)
#Store it separately for clustering
rfm_normalized= scaler.transform(rfm_log)
pd.DataFrame(rfm_normalized).describe()
Out[22]:
StandardScaler()
Out[22]:
See Full Dataframe in Mito
0 1 2
count 4.338000e+03 4.338000e+03 4.338000e+03
mean 7.862160e-17 -2.096576e-16 6.355246e-16
std 1.000115e+00 1.000115e+00 1.000115e+00
min -2.630421e+00 -2.775156e+00 -4.179095e+00
25% -6.126634e-01 -6.385864e-01 -6.843467e-01
50% 1.148465e-01 2.583917e-02 -6.090769e-02
75% 8.297890e-01 6.978064e-01 6.543418e-01
max 1.505633e+00 3.988259e+00 4.721171e+00
  1. Choosing a Number of Clusters

    Methods to define the number of clusters

  • Visual methods - elbow criterion
  • Mathematical methods - silhouette coefficient
  • Experimentation and interpretation

Elbow criterion method

  • Plot the number of clusters against within-cluster sum-of-squared-errors (SSE) - sum of squared distances from every data point to their cluster center
  • Identify an "elbow" in the plot
  • Elbow - a point representing an "optimal" number of clusters
In [23]:
from sklearn.cluster import KMeans

# First : Get the Best KMeans
ks = range(1, 8)
inertias = []
for k in ks:
    # Create a KMeans clusters
    kc = KMeans(n_clusters=k, random_state=1)
    kc.fit(rfm_normalized)
    inertias.append(kc.inertia_)
pd.DataFrame({"ks": ks, "inertias": inertias}).plot(
    x="ks",
    y="inertias",
    kind="line",
    title="What is the Best Number for KMeans ?",
    width=800,
)
Out[23]:
KMeans(n_clusters=1, random_state=1)
Out[23]:
KMeans(n_clusters=2, random_state=1)
Out[23]:
KMeans(n_clusters=3, random_state=1)
Out[23]:
KMeans(n_clusters=4, random_state=1)
Out[23]:
KMeans(n_clusters=5, random_state=1)
Out[23]:
KMeans(n_clusters=6, random_state=1)
Out[23]:
KMeans(n_clusters=7, random_state=1)
Out[23]:
In [65]:
# clustering
kc = KMeans(n_clusters= 3, random_state=1)
kc.fit(rfm_normalized)

#Create a cluster label column in the original DataFrame
cluster_labels = kc.labels_

#Calculate average RFM values and size for each cluster:
rfm_rfm_k3 = rfm_rfm.assign(K_Cluster = cluster_labels)

#Calculate average RFM values and sizes for each cluster:
rfm_rfm_k3.groupby('K_Cluster').agg({'Recency': 'mean','Frequency': 'mean',
                                         'MonetaryValue': ['mean', 'count'],}).round(0)
Out[65]:
KMeans(n_clusters=3, random_state=1)
Out[65]:
See Full Dataframe in Mito
Recency Frequency MonetaryValue
mean mean mean count
K_Cluster
0 171.0 15.0 293.0 1523
1 69.0 65.0 1167.0 1859
2 13.0 260.0 6559.0 956

Snake plots to understand and compare segments

  • Market research technique to compare different segments
  • Visual representation of each segment's attributes
  • Need to first normalize data (center & scale)
  • Plot each cluster's average normalized values of each attribute
In [66]:
rfm_normalized = pd.DataFrame(rfm_normalized,index=rfm_rfm.index,columns=rfm_rfm.columns)
rfm_normalized['K_Cluster'] = kc.labels_
rfm_normalized['General_Segment'] = rfm['General_Segment']
rfm_normalized.reset_index(inplace = True)

#Melt the data into a long format so RFM values and metric names are stored in 1 column each
rfm_melt = pd.melt(rfm_normalized,id_vars=['CustomerID','General_Segment','K_Cluster'],value_vars=['Recency', 'Frequency', 'MonetaryValue'],
var_name='Metric',value_name='Value')
rfm_melt.head()
Out[66]:
See Full Dataframe in Mito
CustomerID General_Segment K_Cluster Metric Value
0 12346.0 Sliver 1 Recency 1.409982
1 12347.0 Gold 2 Recency -2.146578
2 12348.0 Sliver 1 Recency 0.383648
3 12349.0 Gold 1 Recency -0.574961
4 12350.0 Bronze 0 Recency 1.375072
In [67]:
f, (ax1, ax2) = plt.subplots(1,2, figsize=(15, 8))
sns.lineplot(x = 'Metric', y = 'Value', hue = 'General_Segment', data = rfm_melt,ax=ax1)

# a snake plot with K-Means
sns.lineplot(x = 'Metric', y = 'Value', hue = 'K_Cluster', data = rfm_melt,ax=ax2)

plt.suptitle("Snake Plot of RFM",fontsize=24) #make title fontsize subtitle 
plt.show()
Out[67]:
<AxesSubplot:xlabel='Metric', ylabel='Value'>
Out[67]:
<AxesSubplot:xlabel='Metric', ylabel='Value'>
Out[67]:
Text(0.5, 0.98, 'Snake Plot of RFM')

Relative importance of segment attributes

Useful technique to identify relative importance of each segment's attribute

  • Calculate average values of each cluster
    • Calculate average values of population
  • Calculate importance score by dividing them and subtracting 1 (ensures 0 is returned when cluster average equals population average)

Let’s try again with a heat map. Heat maps are a graphical representation of data where larger values were colored in darker scales and smaller values in lighter scales. We can compare the variance between the groups quite intuitively by colors.

In [68]:
# The further a ratio is from 0, the more important that attribute is for a segment relative to the total population
cluster_avg = rfm_rfm_k3.groupby(['K_Cluster']).mean()
population_avg = rfm_rfm.mean()
relative_imp = cluster_avg / population_avg - 1
relative_imp.round(2)
Out[68]:
See Full Dataframe in Mito
Recency Frequency MonetaryValue
K_Cluster
0 0.85 -0.84 -0.86
1 -0.25 -0.28 -0.43
2 -0.86 1.87 2.20
In [69]:
# the mean value in total 
total_avg = rfm.iloc[:, 0:3].mean()
# calculate the proportional gap with total mean
cluster_avg = rfm.groupby('General_Segment').mean().iloc[:, 0:3]
prop_rfm = cluster_avg/total_avg - 1
prop_rfm.round(2)
Out[69]:
See Full Dataframe in Mito
Recency Frequency MonetaryValue
General_Segment
Bronze 1.08 -0.83 -0.87
Gold -0.78 1.49 1.56
Sliver -0.22 -0.45 -0.48
In [70]:
# heatmap with RFM
f, (ax1, ax2) = plt.subplots(1,2, figsize=(15, 5))
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='Blues',ax=ax1)
ax1.set(title = "Heatmap of K-Means")

# a snake plot with K-Means
sns.heatmap(prop_rfm, cmap= 'Oranges', fmt= '.2f', annot = True,ax=ax2)
ax2.set(title = "Heatmap of RFM quantile")

plt.suptitle("Heat Map of RFM",fontsize=20) #make title fontsize subtitle 

plt.show()
Out[70]:
<AxesSubplot:ylabel='K_Cluster'>
Out[70]:
[Text(0.5, 1.0, 'Heatmap of K-Means')]
Out[70]:
<AxesSubplot:ylabel='General_Segment'>
Out[70]:
[Text(0.5, 1.0, 'Heatmap of RFM quantile')]
Out[70]:
Text(0.5, 0.98, 'Heat Map of RFM')
In [ ]: